------------------------------------------------------------------------------------------------------------------------------------------
      name:  <unnamed>
       log:  E:\REStat_MS14767_Vol96(2)\Data preparation Compustat\estimation_sample.log
  log type:  text
 opened on:  18 Dec 2014, 17:02:14

. *---------------------------------------------------------------
. 
. 
. ********************************************************************************
. ********      in this file we create the final estimation sample  **************
. ********************************************************************************
. 
. use "compustat_master_america.dta", clear

. so ticker year

. keep  ticker vantagekey name countryinc employees sales forn_sal ta year SICALL2-SICALL15 SIC4 CUSIP rdexpense mkvalq

. rename CUSIP cusip

. 
. gen SIC2=int(SIC4/100)

. label var SIC2 "SIC4/100"

. 
. 
. 
. ********************************************************
. *****          here we add the variables related to RJV participation (see RJVpartic.do)
. ********************************************************
. 
. merge ticker year using RJVvars.dta

. tab _merge

     _merge |      Freq.     Percent        Cum.
------------+-----------------------------------
          1 |    266,644       94.56       94.56
          3 |     15,330        5.44      100.00
------------+-----------------------------------
      Total |    281,974      100.00

. drop _merge

. 
. ********************************************************
. *****          here we add the variables related to links(see links.do)
. ********************************************************
. 
. so ticker year

. merge ticker year using links.dta
(note: variable year was int, now float to accommodate using data's values)

. tab _merge

     _merge |      Freq.     Percent        Cum.
------------+-----------------------------------
          1 |    275,987       97.88       97.88
          3 |      5,987        2.12      100.00
------------+-----------------------------------
      Total |    281,974      100.00

. drop _merge

. 
. 
. ********************************************************
. *****           here we add the variables related to patents (see patentvars.do)
. ********************************************************
. 
. so cusip year

. merge cusip year using "patentvars.dta"

. tab _merge

     _merge |      Freq.     Percent        Cum.
------------+-----------------------------------
          1 |    272,216       96.54       96.54
          3 |      9,758        3.46      100.00
------------+-----------------------------------
      Total |    281,974      100.00

. drop _merge

. 
. ************************************************************
. ***             some other patent vars are added (see patentpanel.do)
. ************************************************************
. 
. so ticker year

. merge ticker year using "patentpanel.dta"

. tab _merge

     _merge |      Freq.     Percent        Cum.
------------+-----------------------------------
          1 |    260,806       84.77       84.77
          2 |     25,704        8.35       93.12
          3 |     21,168        6.88      100.00
------------+-----------------------------------
      Total |    307,678      100.00

. drop if _merge==2
(25704 observations deleted)

. drop _merge

. 
. ************************************************************
. *** Remark: Don't care about observations with _merge==2,
. *** they are due to different time intervals, not to different
. *** firms. That is, patent data starts already in 1969 whereas
. *** Compustat starts in 1986. 
. **************************************************************
. 
. 
. **********************************************************
. *****           WE KEEP ONLY AMERICAN FIRMS          *****
. *****             FOR WHICH WE HAVE SALES            *****
. **********************************************************
. 
. count if countryinc!=0
    0

. 
. * sales are changed into sales_new by substracting forn_sales
. 
. replace forn_sales=0 if  forn_sales==.
(280078 real changes made)

. gen sales_new=sales- forn_sales
(169281 missing values generated)

. count if sales_new<0
   84

. replace sales_new=. if sales_new<0
(84 real changes made, 84 to missing)

. count if sales_new==.
169365

. drop if sales_new==.
(169365 observations deleted)

. label var sales_new "Sales - foreign sales"

. 
. compress
  year was float now int
  SIC2 was float now byte
  RJVtotal was float now int
  RJV was float now byte
  RJVsametot was float now int
  RJVdifftot was float now int
  RJVsame was float now byte
  RJVdiff was float now byte
  RJVnfirms was float now int
  RJVnsame was float now int
  RJVndiff was float now int
  links1_tot was float now byte
  links2_tot was float now byte
  links1 was float now byte
  links2 was float now byte
  links1_same was float now byte
  links2_same was float now byte
  links1_diff was float now byte
  links2_diff was float now byte
  numpatentsy was float now int
  ticker was str8 now str7
  (5,968,277 bytes saved)

. drop countryinc vantagekey sales forn_sales

. 
. 
. 
. ********************* we correct some variables  ****************
. ********************* and generate some others     ****************
. 
. so ticker year

. egen firmnum=group(ticker)

. label var firmnum "group(ticker)"

. egen ncomp=count(firmnum), by(SIC4 year)

. label var ncomp "number of competitors per year/SIC4"

. 
. * this is just to control that there are no mistake in the panel
. * if everythnig is correct no observation should be dropped!!
. drop if firmnum==firmnum[_n-1]& year==year[_n-1]
(0 observations deleted)

. 
. 
. ********************************************************
. *****           
. *****           here we define market shares (our dep. var.) defining the market as the SIC4 industry
. *****           
. ********************************************************
. 
. egen tot_sales=sum(sales_new), by (SIC4 year)

. label var tot_sales "sum of sales by SIC4, year"

. gen MS=sales_new/tot_sales
(5 missing values generated)

. label var MS "Market shares firm/year at the SIC4 level"

. 
. gen aa=MS*MS
(5 missing values generated)

. egen HHI=sum(aa), by(SIC4 year)

. label var HHI "HHI index at the SIC4 level"

. drop aa

. 
. 
. ********************************************************
. *****           
. *****           NETWORK
. *****           
. ********************************************************
. 
. 
. replace links1_tot=0 if  links1_tot==.
(112609 real changes made)

. replace links2_tot=0 if  links2_tot==.
(112609 real changes made)

. replace links2=0 if links2==.
(112609 real changes made)

. replace links2_same=0 if links2_same==.
(112609 real changes made)

. replace links2_diff=0 if links2_diff==.
(112609 real changes made)

. 
. gen links1_ver= links1_tot-links1
(112609 missing values generated)

. label var links1_ver "Pure vertical links (i.e. links with non-competitors) with double counting"

. gen links2_ver= links2_tot-links2

. label var links2_ver "Pure vertical links (i.e. links with non-competitors) without double counting"

. 
. 
. gen network=links2/(ncomp-1)
(262 missing values generated)

. label var network "links2/competitors"

. 
. * we have to make a small correction since for some obs. the number of links is larger than the number of competitors (# of firms -1)
. * this is because we have 11 firms in the RJV that have no sales and we dropped these observations
. * these firms are: DEC  DGN.    FMC     GCHI    HLN     HPQ     IDL.1   LCE     MSA.1   NSTS    SDW
. 
. replace network=1 if links2>ncomp-1
(0 real changes made)

. replace network=0 if network==.
(262 real changes made)

. 
. gen network2=network*network

. label var network2 "network squared"

. 
. 
. gen network_ver=links2_ver/(ncomp-1)
(262 missing values generated)

. label var network "links2_ver/competitors"

. 
. gen network_ver2=network_ver*network_ver
(262 missing values generated)

. label var network "network_ver squared"

. 
.                 **** dividing between RJV in the same SIC industry 
.                 **** and in different SIC industry
. 
. gen network_s=links2_same/(ncomp-1)
(262 missing values generated)

. label var network_s "same ind. links2/competitors"

. 
. * we have to make a small correction since for some obs. the number of links is larger than the number of competitors (# of firms -1)
. 
. replace network_s=1 if links2_same>ncomp-1
(0 real changes made)

. replace network_s=0 if network_s==.
(262 real changes made)

. 
. gen network_s2=network_s*network_s

. label var network_s2 "network_s squared"

. 
. gen network_d=links2_diff/(ncomp-1)
(262 missing values generated)

. label var network_d "diff. ind. links2/competitors"

. 
. * we have to make a small correction since for some obs. the number of links is larger than the number of competitors (# of firms -1)
. replace network_d=1 if links2_diff>ncomp-1
(0 real changes made)

. replace network_d=0 if network_d==.
(262 real changes made)

. 
. gen network_d2=network_d*network_d

. label var network_d2 "network_d squared"

. 
. 
. 
. 
. ********************************************************
. *****           
. *****           RJV in same/different industry
. *****           
. ********************************************************
. 
. drop  RJVsame RJVdiff

. gen RJVsame=0

. replace RJVsame=1 if links2>0
(0 real changes made)

. label var RJVsame "=1 if the firm meets at least one competitior in the RJV(s)"

. 
. gen RJVdiff=0

. replace RJVdiff= RJV-RJVsame if RJV>0
(108420 real changes made, 102433 to missing)

. label var RJVdiff "=1 if the firm does not meet any competitior in the RJV(s)"

. 
. gen lRJVtotal=log(RJVtotal+1)
(102433 missing values generated)

. label var lRJVtotal "log(RJVtotal+1)"

. 
. 
. 
. ********************************************************
. *****           
. *****           here we generate some additional variables
. *****           
. ********************************************************
. 
. use "E:\REStat_MS14767_Vol96(2)\Estimation\estimation_sample", clear

. so ticker year

. merge ticker year using "networkMS.dta"
(note: variable ticker was str7, now str8 to accommodate using data's values)
(note: variable year was int, now float to accommodate using data's values)

. drop _merge

. so ticker year

. merge ticker year using "asset.dta"

. drop if _merge==2
(169365 observations deleted)

. 
. gen lta=log(ta+1)
(178 missing values generated)

. label var ta "Log total assents in millions dollars"

. drop _merge

. 
. * here we add the infos about the network built on the largest RJV (max_links)
. 
. so ticker year

. 
. merge ticker year using "max_links.dta"

. 
. gen network_n=links2_n/(ncomp-1)
(106641 missing values generated)

. gen network_max=max_links2/(ncomp-1)
(106641 missing values generated)

. gen network_av=network/RJVsametot
(108872 missing values generated)

. gen network_av_n=network_n/RJVsametot
(108887 missing values generated)

. 
. drop _merge

. so SIC2 year

. merge SIC2 year using "import_dataSIC2.dta"
variables SIC2 year do not uniquely identify observations in the master data
(note: variable SIC2 was byte, now float to accommodate using data's values)
variables SIC2 year do not uniquely identify observations in import_dataSIC2.dta

. 
. 
. **** We create a new measure of MS to weight the MS of those (conglomerate) firms which are in several SICs
. * one possibility for weighting is to give each first sic code a weight
. * of 2/(n+1), and subsequent sics would get either each 1/(n+1) or some
. * further scaling down, but what weight exactlz we would give to subsequent
. * ones is at this point not so relevant, since we would give them now zero.
. * one could give the argumentation that sics are not ranked according to
. * importance, only the first one (which i think is true)
. 
. egen SICn=rownonmiss(SIC4 SICALL2 SICALL3 SICALL4 SICALL5 SICALL6 SICALL7 SICALL8 SICALL9 SICALL10 SICALL11 SICALL12 SICALL13 SICALL14 S
> ICALL15)

. gen MSw=MS*2/(1+SICn)
(5 missing values generated)

. 
. * here we first define those industries where no RJV was formed
. * these observations will represent the "zeros"
. 
. gen aa=RJV
(102433 missing values generated)

. egen bb=max(aa), by(SIC4)
(30350 missing values generated)

. replace bb=0 if bb==.
(30350 real changes made)

. gen zeros=0

. replace zeros=1 if bb==0
(30531 real changes made)

. drop aa bb

. 
. gen aa=RJVsame

. egen bb=max(aa), by(SIC4)

. gen zero_same=0

. replace zero_same=1 if bb==0
(61253 real changes made)

. drop aa bb

. 
. replace patents=0 if patents==.
(98150 real changes made)

. replace patents_t_1=0 if patents_t_1==.
(98150 real changes made)

. replace patents_t_2=0 if patents_t_2==.
(98150 real changes made)

. replace patents_t_3=0 if patents_t_3==.
(98150 real changes made)

. replace patents_t_4=0 if patents_t_4==.
(98150 real changes made)

. 
. 
. gen only_ver=0

. replace only_ver=1 if network==0 & RJV==1
(2366 real changes made)

. 
. 
. * here we redefine the RJV dummies
. 
. gen RJV2=RJV
(102433 missing values generated)

. replace RJV2=0 if RJV2==.
(102433 real changes made)

. 
. * horizontal vs. vertical
. 
. gen RJVsame2=RJVsame

. replace RJVsame2=0 if RJVsame2==.
(0 real changes made)

. rename RJVsame2 RJVhor

. label var RJVhor "RJV with firms from the same industry"

. 
. 
. gen RJVdiff2=RJVdiff
(102433 missing values generated)

. replace RJVdiff2=0 if RJVdiff==.
(102433 real changes made)

. rename RJVdiff2 RJVver

. label var RJVver "RJV without firms from the same industry"

. 
. * horizontal in the same industry vs. different industry
. 
. gen RJVsame2=0

. replace RJVsame2=1 if network_s>0
(766 real changes made)

. label var RJVsame2 "RJV with competitors in the same industry"

. 
. gen RJVdiff2=0

. replace RJVdiff2=1 if RJVsame2==0 & RJVhor==1
(2855 real changes made)

. label var RJVsame2 "RJV with competitors in different industries"

. 
. 
. * we replace the missing with zeros in the r&d variable 
. 
. replace ta=0 if ta==.
(178 real changes made)

. gen rd2=rdexpense
(60384 missing values generated)

. replace rd2=0 if rd2==. | rd2<0
(60387 real changes made)

. gen lrd2=log(rd2+1)

. 
. * we create a R&D intensity variable
. gen rd_int=rd2/sales
(2805 missing values generated)

. 
. 
. * we build a r&d stock
. 
. so ticker year

. by ticker: gen yearweight=year[_n+1]-year if year!=year[_n+1]
(16567 missing values generated)

. gen xxx=rdexpense
(60384 missing values generated)

. replace xxx=0 if xxx==.
(60384 real changes made)

. so ticker year

. by ticker: gen rd_stock=xxx+xxx[_n-1]*(1-0.15)^yearweight if year!=year[_n-1]
(31529 missing values generated)

. label var rd_stock "r&d stock, depreciated"

. drop xxx

. replace rd_stock=rd_stock[_n-1] if year==year & ticker==ticker[_n-1] & rd_stock==.
(13005 real changes made)

. replace rd_stock=0 if rd_stock==.
(18524 real changes made)

. so ticker year

. 
. replace rd_stock=0 if rd_stock<0
(4 real changes made)

. gen lrd_stock=log(rd_stock+1)

. 
. 
. * we generate industry specific variables
. 
. egen m_rd=mean(rd2), by(SIC4 year)

. label var m_rd "Mean R&D exp. at the SIC4/year level

. 
. egen m_lrd=mean(lrd2), by(SIC4 year)

. label var m_rd "Mean log R&D exp. at the SIC4/year level

. 
. egen m_rdexpense=mean(rdexpense), by(SIC4 year)
(17122 missing values generated)

. label var m_rdexpense "Mean R&D exp. at the SIC4/year level"

. 
. gen lmkval=log(mkvalq)
(22938 missing values generated)

. egen m_lmkval=mean(lmkval), by(SIC4 year)
(97 missing values generated)

. label var m_lmkval "Mean log MV at the SIC4/year level"

. 
. egen m_mkval=mean(mkval), by(SIC4 year)
(97 missing values generated)

. label var m_mkval "Mean MV at the SIC4/year level"

. 
. 
. *gen lmkval=log(mkvalq)
. gen lmkval2=lmkval
(22938 missing values generated)

. replace lmkval2=0 if lmkval==.
(22938 real changes made)

. 
. egen m_lmkval2=mean(lmkval2), by(SIC4 year)

. label var m_lmkval "Mean log MV at the SIC4/year level

. 
. gen only_horiz=0

. replace only_horiz=1 if network>0
(3621 real changes made)

. 
. egen nRJVfirm_hor=sum(only_horiz), by(SIC4 year)

. gen coverage= nRJVfirm_hor/ncom

. replace coverage=0 if network==0
(34876 real changes made)

. gen coverage2=coverage*coverage

. 
. gen only_horiz_s=0

. replace only_horiz_s=1 if network_s>0
(766 real changes made)

. 
. 
. *** we generate the vertical links
. 
. replace links2_tot=0 if links2_tot==.
(106622 real changes made)

. gen links2_ver=links2_tot-links2_same-links2_diff

. gen links_ver=links2_ver

. *label var RJVeff2 "Heterogenous effect (RJV dummies)"
. *label var RJVeff_s2 "Heterogenous effect (RJV dummies)"
. label var network "RJV network coverage"

. label var network_s "RJV network coverage"

. label var network_d "RJV network coverage"

. label var networkMS "RJV network coverage in MS"

. label var networkMS_s "RJV network coverage in MS"

. label var networkMS_d "RJV network coverage in MS"

. label var RJV2 "RJV"

. label var RJVver "Vertical RJV"

. label var RJVhor "Horizontal RJV"

. label var links_ver "Vertical links"

. label var lrd2 "log(R&D)"

. label var m_lmkval "log(MV)"

. label var m_lmkval "log(MV) SIC4"

. label var m_lrd "log(R&D) SIC4"

. 
. gen RJVhor_s=0

. replace RJVhor_s=1 if network>0 & network<.031746
(905 real changes made)

. 
. gen RJVhor_m=0

. replace RJVhor_m=1 if network>=.031746 & network<.1948052
(1811 real changes made)

. 
. gen RJVhor_l=0

. replace RJVhor_l=1 if network>=.1948052
(905 real changes made)

. 
. keep sales_new ta rd2 rd_int patents network network2 networkMS coverage RJV RJV2 RJVhor RJVver RJVhor_s RJVhor_m RJVhor_l RJVsametot li
> nks2_ver m_rdexpense  m_lmkval2 ncomp lrd2 m_lmkval m_lrd patents_t_1 patents_t_2 patents_t_3 lta lrd2 m_lrd m_lmkval SIC4 firmnum year 
> MS zero_same dy* ticker MSw

. 
. save RESTATestimation_sample_final.dta, replace
(note: file RESTATestimation_sample_final.dta not found)
file RESTATestimation_sample_final.dta saved

. 
. 
. 
. log close
      name:  <unnamed>
       log:  E:\REStat_MS14767_Vol96(2)\Data preparation Compustat\estimation_sample.log
  log type:  text
 closed on:  18 Dec 2014, 17:09:08
------------------------------------------------------------------------------------------------------------------------------------------
